"""
PDF Download Service for Research Library

Handles downloading PDFs from various academic sources with:
- Deduplication using download tracker
- Source-specific download strategies (arXiv, PubMed, etc.)
- Progress tracking and error handling
- File organization and storage
"""

import hashlib
import re
import time
import uuid
from datetime import datetime, UTC
from pathlib import Path
from typing import Optional, Tuple
from urllib.parse import urlparse

import requests
from loguru import logger
from sqlalchemy.orm import Session
import pdfplumber
from pypdf import PdfReader

from ...database.models.download_tracker import (
    DownloadAttempt,
    DownloadTracker,
)
from ...security import redact_data
from ...database.models.library import (
    Collection,
    Document as Document,
    DocumentStatus,
    DownloadQueue as LibraryDownloadQueue,
    DocumentCollection,
)
from .pdf_storage_manager import PDFStorageManager
from ...database.models.research import ResearchResource
from ...database.library_init import get_source_type_id, get_default_library_id
from ...database.session_context import get_user_db_session
from ...utilities.db_utils import get_settings_manager
from ...library.download_management import RetryManager
from ..utils import (
    get_url_hash,
    get_absolute_path_from_settings,
)

# Import our modular downloaders
from ..downloaders import (
    ContentType,
    ArxivDownloader,
    PubMedDownloader,
    BioRxivDownloader,
    DirectPDFDownloader,
    SemanticScholarDownloader,
    OpenAlexDownloader,
    GenericDownloader,
)


class DownloadService:
    """Service for downloading and managing research PDFs."""

    def __init__(self, username: str, password: Optional[str] = None):
        """Initialize download service for a user.

        Args:
            username: The username to download for
            password: Optional password for encrypted database access
        """
        self.username = username
        self.password = password
        self.settings = get_settings_manager(username=username)

        # Debug settings manager and user context
        logger.info(
            f"[DOWNLOAD_SERVICE] Settings manager initialized: {type(self.settings)}, username: {self.username}"
        )

        # Get library path from settings
        storage_path_setting = self.settings.get_setting(
            "research_library.storage_path",
            "~/.local-deep-research/library",
        )
        logger.warning(
            f"[DOWNLOAD_SERVICE_INIT] Storage path setting retrieved: {storage_path_setting} (type: {type(storage_path_setting)})"
        )

        if storage_path_setting is None:
            logger.error(
                "[DOWNLOAD_SERVICE_INIT] CRITICAL: storage_path_setting is None!"
            )
            raise ValueError("Storage path setting cannot be None")

        self.library_root = str(Path(storage_path_setting).expanduser())
        logger.warning(
            f"[DOWNLOAD_SERVICE_INIT] Library root resolved to: {self.library_root}"
        )

        # Create directory structure
        self._setup_directories()

        # Initialize modular downloaders
        # DirectPDFDownloader first for efficiency with direct PDF links

        # Get Semantic Scholar API key from settings
        semantic_scholar_api_key = self.settings.get_setting(
            "search.engine.web.semantic_scholar.api_key", ""
        )

        self.downloaders = [
            DirectPDFDownloader(timeout=30),  # Handle direct PDF links first
            SemanticScholarDownloader(
                timeout=30,
                api_key=semantic_scholar_api_key
                if semantic_scholar_api_key
                else None,
            ),
            OpenAlexDownloader(
                timeout=30
            ),  # OpenAlex with API lookup (no key needed)
            ArxivDownloader(timeout=30),
            PubMedDownloader(timeout=30, rate_limit_delay=1.0),
            BioRxivDownloader(timeout=30),
            GenericDownloader(timeout=30),  # Generic should be last (fallback)
        ]

        # Initialize retry manager for smart failure tracking
        self.retry_manager = RetryManager(username, password)
        logger.info(
            f"[DOWNLOAD_SERVICE] Initialized retry manager for user: {username}"
        )

        # PubMed rate limiting state
        self._pubmed_delay = 1.0  # 1 second delay for PubMed
        self._last_pubmed_request = 0.0  # Track last request time

    def _setup_directories(self):
        """Create library directory structure."""
        # Only create the root and pdfs folder - flat structure
        paths = [
            self.library_root,
            str(Path(self.library_root) / "pdfs"),
        ]
        for path in paths:
            Path(path).mkdir(parents=True, exist_ok=True)

    def _normalize_url(self, url: str) -> str:
        """Normalize URL for consistent hashing."""
        # Remove protocol variations
        url = re.sub(r"^https?://", "", url)
        # Remove www
        url = re.sub(r"^www\.", "", url)
        # Remove trailing slashes
        url = url.rstrip("/")
        # Sort query parameters
        if "?" in url:
            base, query = url.split("?", 1)
            params = sorted(query.split("&"))
            url = f"{base}?{'&'.join(params)}"
        return url.lower()

    def _get_url_hash(self, url: str) -> str:
        """Generate SHA256 hash of normalized URL."""
        normalized = self._normalize_url(url)
        return get_url_hash(normalized)

    def is_already_downloaded(self, url: str) -> Tuple[bool, Optional[str]]:
        """
        Check if URL is already downloaded.

        Returns:
            Tuple of (is_downloaded, file_path)
        """
        url_hash = self._get_url_hash(url)

        with get_user_db_session(self.username, self.password) as session:
            tracker = (
                session.query(DownloadTracker)
                .filter_by(url_hash=url_hash, is_downloaded=True)
                .first()
            )

            if tracker and tracker.file_path:
                # Compute absolute path and verify file still exists
                absolute_path = get_absolute_path_from_settings(
                    tracker.file_path
                )
                if absolute_path.exists():
                    return True, str(absolute_path)
                else:
                    # File was deleted, mark as not downloaded
                    tracker.is_downloaded = False
                    session.commit()

            return False, None

    def get_text_content(self, resource_id: int) -> Optional[str]:
        """
        Get text content for a research resource.

        This will try to:
        1. Fetch text directly from APIs if available
        2. Extract text from downloaded PDF if exists
        3. Download PDF and extract text if not yet downloaded

        Args:
            resource_id: ID of the research resource

        Returns:
            Text content as string, or None if extraction failed
        """
        with get_user_db_session(self.username, self.password) as session:
            resource = session.query(ResearchResource).get(resource_id)
            if not resource:
                logger.error(f"Resource {resource_id} not found")
                return None

            url = resource.url

            # Find appropriate downloader
            for downloader in self.downloaders:
                if downloader.can_handle(url):
                    logger.info(
                        f"Using {downloader.__class__.__name__} for text extraction from {url}"
                    )
                    try:
                        # Try to get text content
                        text = downloader.download_text(url)
                        if text:
                            logger.info(
                                f"Successfully extracted text for: {resource.title[:50]}"
                            )
                            return text
                    except Exception:
                        logger.exception("Failed to extract text")
                    break

            logger.warning(f"Could not extract text for {url}")
            return None

    def queue_research_downloads(
        self, research_id: str, collection_id: Optional[str] = None
    ) -> int:
        """
        Queue all downloadable PDFs from a research session.

        Args:
            research_id: The research session ID
            collection_id: Optional target collection ID (defaults to Library if not provided)

        Returns:
            Number of items queued
        """
        queued = 0

        # Get default library collection if no collection_id provided
        if not collection_id:
            from ...database.library_init import get_default_library_id

            collection_id = get_default_library_id(self.username)

        with get_user_db_session(self.username, self.password) as session:
            # Get all resources for this research
            resources = (
                session.query(ResearchResource)
                .filter_by(research_id=research_id)
                .all()
            )

            for resource in resources:
                if self._is_downloadable(resource):
                    # Check if already queued
                    existing_queue = (
                        session.query(LibraryDownloadQueue)
                        .filter_by(
                            resource_id=resource.id,
                            status=DocumentStatus.PENDING,
                        )
                        .first()
                    )

                    # Check if already downloaded (trust the database status)
                    existing_doc = (
                        session.query(Document)
                        .filter_by(
                            resource_id=resource.id,
                            status=DocumentStatus.COMPLETED,
                        )
                        .first()
                    )

                    # Queue if not already queued and not marked as completed
                    if not existing_queue and not existing_doc:
                        # Check one more time if ANY queue entry exists (regardless of status)
                        any_queue = (
                            session.query(LibraryDownloadQueue)
                            .filter_by(resource_id=resource.id)
                            .first()
                        )

                        if any_queue:
                            # Reset the existing queue entry
                            any_queue.status = DocumentStatus.PENDING
                            any_queue.research_id = research_id
                            any_queue.collection_id = collection_id
                            queued += 1
                        else:
                            # Add new queue entry
                            queue_entry = LibraryDownloadQueue(
                                resource_id=resource.id,
                                research_id=research_id,
                                collection_id=collection_id,
                                priority=0,
                                status=DocumentStatus.PENDING,
                            )
                            session.add(queue_entry)
                            queued += 1

            session.commit()
            logger.info(
                f"Queued {queued} downloads for research {research_id} to collection {collection_id}"
            )

        return queued

    def _is_downloadable(self, resource: ResearchResource) -> bool:
        """Check if a resource is likely downloadable as PDF."""
        url = resource.url.lower() if resource.url else ""

        # Check for PDF extensions
        if url.endswith(".pdf"):
            return True

        # Check for known academic sources with downloadable PDFs
        downloadable_domains = [
            "arxiv.org",
            "biorxiv.org",
            "medrxiv.org",
            "ncbi.nlm.nih.gov/pmc",  # PubMed Central has PDFs
            "pubmed.ncbi.nlm.nih.gov",  # Try to find PMC version
            "semanticscholar.org",
            "researchgate.net",
            "academia.edu",
        ]

        return any(domain in url for domain in downloadable_domains)

    def download_resource(self, resource_id: int) -> Tuple[bool, Optional[str]]:
        """
        Download a specific resource.

        Returns:
            Tuple of (success: bool, skip_reason: str or None)
        """
        with get_user_db_session(self.username, self.password) as session:
            resource = session.query(ResearchResource).get(resource_id)
            if not resource:
                logger.error(f"Resource {resource_id} not found")
                return False, "Resource not found"

            # Check if already downloaded (trust the database after sync)
            existing_doc = (
                session.query(Document)
                .filter_by(
                    resource_id=resource_id, status=DocumentStatus.COMPLETED
                )
                .first()
            )

            if existing_doc:
                logger.info(
                    "Resource already downloaded (according to database)"
                )
                return True, None

            # Get collection_id from queue entry if it exists
            queue_entry = (
                session.query(LibraryDownloadQueue)
                .filter_by(resource_id=resource_id)
                .first()
            )
            collection_id = (
                queue_entry.collection_id
                if queue_entry and queue_entry.collection_id
                else None
            )

            # Create download tracker entry
            url_hash = self._get_url_hash(resource.url)
            tracker = (
                session.query(DownloadTracker)
                .filter_by(url_hash=url_hash)
                .first()
            )

            if not tracker:
                tracker = DownloadTracker(
                    url=resource.url,
                    url_hash=url_hash,
                    first_resource_id=resource.id,
                    is_downloaded=False,
                )
                session.add(tracker)
                session.commit()

            # Attempt download
            result = self._download_pdf(
                resource, tracker, session, collection_id
            )

            # Get skip reason if failed
            skip_reason = None
            if isinstance(result, tuple):
                success, skip_reason = result
            else:
                success = result
                if not success:
                    # Try to get skip reason from last attempt
                    last_attempt = (
                        session.query(DownloadAttempt)
                        .filter_by(url_hash=tracker.url_hash)
                        .order_by(DownloadAttempt.attempted_at.desc())
                        .first()
                    )
                    if last_attempt and hasattr(last_attempt, "error_message"):
                        skip_reason = last_attempt.error_message

            # Record attempt with retry manager for smart failure tracking
            self.retry_manager.record_attempt(
                resource_id=resource.id,
                result=(success, skip_reason),
                url=resource.url,
                details=skip_reason
                or (
                    "Successfully downloaded" if success else "Download failed"
                ),
                session=session,
            )

            # Update queue status if exists
            queue_entry = (
                session.query(LibraryDownloadQueue)
                .filter_by(resource_id=resource_id)
                .first()
            )

            if queue_entry:
                queue_entry.status = (
                    DocumentStatus.COMPLETED
                    if success
                    else DocumentStatus.FAILED
                )
                queue_entry.completed_at = datetime.now(UTC)

            session.commit()
            return success, skip_reason

    def _download_pdf(
        self,
        resource: ResearchResource,
        tracker: DownloadTracker,
        session: Session,
        collection_id: Optional[str] = None,
    ) -> Tuple[bool, Optional[str]]:
        """
        Perform the actual PDF download.

        Args:
            resource: The research resource to download
            tracker: Download tracker for this URL
            session: Database session
            collection_id: Optional target collection ID (defaults to Library if not provided)

        Returns:
            Tuple of (success: bool, skip_reason: Optional[str])
        """
        url = resource.url

        # Log attempt
        attempt = DownloadAttempt(
            url_hash=tracker.url_hash,
            attempt_number=tracker.download_attempts.count() + 1
            if hasattr(tracker, "download_attempts")
            else 1,
            attempted_at=datetime.now(UTC),
        )
        session.add(attempt)

        try:
            # Use modular downloaders with detailed skip reasons
            pdf_content = None
            downloader_used = None
            skip_reason = None

            for downloader in self.downloaders:
                if downloader.can_handle(url):
                    logger.info(
                        f"Using {downloader.__class__.__name__} for {url}"
                    )
                    result = downloader.download_with_result(
                        url, ContentType.PDF
                    )
                    downloader_used = downloader.__class__.__name__

                    if result.is_success and result.content:
                        pdf_content = result.content
                        break
                    elif result.skip_reason:
                        skip_reason = result.skip_reason
                        logger.info(f"Download skipped: {skip_reason}")
                        # Keep trying other downloaders unless it's the GenericDownloader
                        if isinstance(downloader, GenericDownloader):
                            break

            if not downloader_used:
                logger.error(f"No downloader found for {url}")
                skip_reason = "No compatible downloader available"

            if not pdf_content:
                error_msg = skip_reason or "Failed to download PDF content"
                # Store skip reason in attempt for retrieval
                attempt.error_message = error_msg
                attempt.succeeded = False
                session.commit()
                logger.info(f"Download failed with reason: {error_msg}")
                return False, error_msg

            # Get PDF storage mode setting
            pdf_storage_mode = self.settings.get_setting(
                "research_library.pdf_storage_mode", "none"
            )
            max_pdf_size_mb = int(
                self.settings.get_setting(
                    "research_library.max_pdf_size_mb", 100
                )
            )
            logger.info(
                f"[DOWNLOAD_SERVICE] PDF storage mode: {pdf_storage_mode}"
            )

            # Update tracker
            import hashlib

            tracker.file_hash = hashlib.sha256(pdf_content).hexdigest()
            tracker.file_size = len(pdf_content)
            tracker.is_downloaded = True
            tracker.downloaded_at = datetime.now(UTC)

            # Initialize PDF storage manager
            pdf_storage_manager = PDFStorageManager(
                library_root=self.library_root,
                storage_mode=pdf_storage_mode,
                max_pdf_size_mb=max_pdf_size_mb,
            )

            # Update attempt with success info
            attempt.succeeded = True

            # Check if library document already exists
            existing_doc = (
                session.query(Document)
                .filter_by(resource_id=resource.id)
                .first()
            )

            if existing_doc:
                # Update existing document
                existing_doc.document_hash = tracker.file_hash
                existing_doc.file_size = len(pdf_content)
                existing_doc.status = DocumentStatus.COMPLETED
                existing_doc.processed_at = datetime.now(UTC)

                # Save PDF using storage manager (updates storage_mode and file_path)
                file_path_result, _ = pdf_storage_manager.save_pdf(
                    pdf_content=pdf_content,
                    document=existing_doc,
                    session=session,
                    filename=f"{resource.id}.pdf",
                    url=url,
                    resource_id=resource.id,
                )

                # Update tracker
                tracker.file_path = (
                    file_path_result if file_path_result else None
                )
                tracker.file_name = (
                    Path(file_path_result).name
                    if file_path_result and file_path_result != "database"
                    else None
                )
            else:
                # Get source type ID for research downloads
                try:
                    source_type_id = get_source_type_id(
                        self.username, "research_download"
                    )
                    # Use provided collection_id or default to Library
                    library_collection_id = (
                        collection_id or get_default_library_id(self.username)
                    )
                except Exception:
                    logger.exception(
                        "Failed to get source type or library collection"
                    )
                    raise

                # Create new unified document entry
                doc_id = str(uuid.uuid4())
                doc = Document(
                    id=doc_id,
                    source_type_id=source_type_id,
                    resource_id=resource.id,
                    research_id=resource.research_id,
                    document_hash=tracker.file_hash,
                    original_url=url,
                    file_size=len(pdf_content),
                    file_type="pdf",
                    mime_type="application/pdf",
                    title=resource.title,
                    status=DocumentStatus.COMPLETED,
                    processed_at=datetime.now(UTC),
                    storage_mode=pdf_storage_mode,
                )
                session.add(doc)
                session.flush()  # Ensure doc.id is available for blob storage

                # Save PDF using storage manager (updates storage_mode and file_path)
                file_path_result, _ = pdf_storage_manager.save_pdf(
                    pdf_content=pdf_content,
                    document=doc,
                    session=session,
                    filename=f"{resource.id}.pdf",
                    url=url,
                    resource_id=resource.id,
                )

                # Update tracker
                tracker.file_path = (
                    file_path_result if file_path_result else None
                )
                tracker.file_name = (
                    Path(file_path_result).name
                    if file_path_result and file_path_result != "database"
                    else None
                )

                # Link document to default Library collection
                doc_collection = DocumentCollection(
                    document_id=doc_id,
                    collection_id=library_collection_id,
                    indexed=False,
                )
                session.add(doc_collection)

            # Update attempt
            attempt.succeeded = True
            attempt.bytes_downloaded = len(pdf_content)

            if pdf_storage_mode == "database":
                logger.info(
                    f"Successfully stored PDF in database: {resource.url}"
                )
            elif pdf_storage_mode == "filesystem":
                logger.info(f"Successfully downloaded: {tracker.file_path}")
            else:
                logger.info(f"Successfully extracted text from: {resource.url}")

            # Automatically extract and save text after successful PDF download
            try:
                logger.info(
                    f"Extracting text from downloaded PDF for: {resource.title[:50]}"
                )
                text = self._extract_text_from_pdf(pdf_content)

                if text:
                    # Get the document ID we just created/updated
                    pdf_doc = (
                        session.query(Document)
                        .filter_by(resource_id=resource.id)
                        .first()
                    )
                    pdf_document_id = pdf_doc.id if pdf_doc else None

                    # Save text to encrypted database
                    self._save_text_with_db(
                        resource=resource,
                        text=text,
                        session=session,
                        extraction_method="pdf_extraction",
                        extraction_source="local_pdf",
                        pdf_document_id=pdf_document_id,
                    )
                    logger.info(
                        f"Successfully extracted and saved text for: {resource.title[:50]}"
                    )
                else:
                    logger.warning(
                        f"Text extraction returned empty text for: {resource.title[:50]}"
                    )
            except Exception as e:
                logger.exception(
                    f"Failed to extract text from PDF, but PDF download succeeded: {e}"
                )
                # Don't fail the entire download if text extraction fails

            return True, None

        except Exception as e:
            logger.exception(f"Download failed for {url}")
            attempt.succeeded = False
            attempt.error_type = type(e).__name__
            attempt.error_message = str(e)
            tracker.is_accessible = False
            # Sanitize error message before returning to API
            safe_error = redact_data(str(e))
            return False, safe_error

    def _extract_text_from_pdf(self, pdf_content: bytes) -> Optional[str]:
        """
        Extract text from PDF content using multiple methods for best results.

        Args:
            pdf_content: Raw PDF bytes

        Returns:
            Extracted text or None if extraction fails
        """
        try:
            # First try with pdfplumber (better for complex layouts)
            import io

            with pdfplumber.open(io.BytesIO(pdf_content)) as pdf:
                text_parts = []
                for page in pdf.pages:
                    page_text = page.extract_text()
                    if page_text:
                        text_parts.append(page_text)

                if text_parts:
                    return "\n\n".join(text_parts)

            # Fallback to PyPDF if pdfplumber fails
            reader = PdfReader(io.BytesIO(pdf_content))
            text_parts = []
            for page in reader.pages:
                text = page.extract_text()
                if text:
                    text_parts.append(text)

            if text_parts:
                return "\n\n".join(text_parts)

            logger.warning("No text could be extracted from PDF")
            return None

        except Exception:
            logger.exception("Failed to extract text from PDF")
            return None

    def download_as_text(self, resource_id: int) -> Tuple[bool, Optional[str]]:
        """
        Download resource and extract text to encrypted database.

        Args:
            resource_id: ID of the resource to download

        Returns:
            Tuple of (success, error_message)
        """
        with get_user_db_session(self.username, self.password) as session:
            # Get the resource
            resource = (
                session.query(ResearchResource)
                .filter_by(id=resource_id)
                .first()
            )
            if not resource:
                return False, "Resource not found"

            # Try existing text in database
            result = self._try_existing_text(session, resource_id)
            if result is not None:
                return result

            # Try legacy text files on disk
            result = self._try_legacy_text_file(session, resource, resource_id)
            if result is not None:
                return result

            # Try extracting from existing PDF
            result = self._try_existing_pdf_extraction(
                session, resource, resource_id
            )
            if result is not None:
                return result

            # Try API text extraction
            result = self._try_api_text_extraction(session, resource)
            if result is not None:
                return result

            # Fallback: Download PDF and extract
            return self._fallback_pdf_extraction(session, resource)

    def _try_existing_text(
        self, session, resource_id: int
    ) -> Optional[Tuple[bool, Optional[str]]]:
        """Check if text already exists in database (in Document.text_content)."""
        existing_doc = (
            session.query(Document).filter_by(resource_id=resource_id).first()
        )

        if not existing_doc:
            return None

        # Check if text content exists and extraction was successful
        if (
            existing_doc.text_content
            and existing_doc.extraction_method
            and existing_doc.extraction_method != "failed"
        ):
            logger.info(
                f"Text content already exists in Document for resource_id={resource_id}, extraction_method={existing_doc.extraction_method}"
            )
            return True, None

        # No text content or failed extraction
        logger.debug(
            f"Document exists but no valid text content: resource_id={resource_id}, extraction_method={existing_doc.extraction_method}"
        )
        return None  # Fall through to re-extraction

    def _try_legacy_text_file(
        self, session, resource, resource_id: int
    ) -> Optional[Tuple[bool, Optional[str]]]:
        """Check for legacy text files on disk."""
        txt_path = Path(self.library_root) / "txt"
        existing_files = (
            list(txt_path.glob(f"*_{resource_id}.txt"))
            if txt_path.exists()
            else []
        )

        if not existing_files:
            return None

        logger.info(f"Text file already exists on disk: {existing_files[0]}")
        self._create_text_document_record(
            session,
            resource,
            existing_files[0],
            extraction_method="unknown",
            extraction_source="legacy_file",
        )
        session.commit()
        return True, None

    def _try_existing_pdf_extraction(
        self, session, resource, resource_id: int
    ) -> Optional[Tuple[bool, Optional[str]]]:
        """Try extracting text from existing PDF in database."""
        pdf_document = (
            session.query(Document).filter_by(resource_id=resource_id).first()
        )

        if not pdf_document or pdf_document.status != "completed":
            return None

        pdf_path = Path(self.library_root) / pdf_document.file_path
        if not pdf_path.exists():
            return None

        logger.info(f"Found existing PDF, extracting text from: {pdf_path}")
        try:
            with open(pdf_path, "rb") as f:
                pdf_content = f.read()
            text = self._extract_text_from_pdf(pdf_content)

            if not text:
                return None

            self._save_text_with_db(
                resource,
                text,
                session,
                extraction_method="pdf_extraction",
                extraction_source="pdfplumber",
                pdf_document_id=pdf_document.id,
            )
            session.commit()
            return True, None

        except Exception:
            logger.exception(
                f"Failed to extract text from existing PDF: {pdf_path}"
            )
            return None  # Fall through to other methods

    def _try_api_text_extraction(
        self, session, resource
    ) -> Optional[Tuple[bool, Optional[str]]]:
        """Try direct API text extraction."""
        logger.info(
            f"Attempting direct API text extraction from: {resource.url}"
        )

        downloader = self._get_downloader(resource.url)
        if not downloader:
            return None

        result = downloader.download_with_result(resource.url, ContentType.TEXT)

        if not result.is_success or not result.content:
            return None

        # Decode text content
        text = (
            result.content.decode("utf-8", errors="ignore")
            if isinstance(result.content, bytes)
            else result.content
        )

        # Determine extraction source
        extraction_source = "unknown"
        if isinstance(downloader, ArxivDownloader):
            extraction_source = "arxiv_api"
        elif isinstance(downloader, PubMedDownloader):
            extraction_source = "pubmed_api"

        try:
            self._save_text_with_db(
                resource,
                text,
                session,
                extraction_method="native_api",
                extraction_source=extraction_source,
            )
            session.commit()
            logger.info(
                f"✓ SUCCESS: Got text from {extraction_source.upper()} API for '{resource.title[:50]}...'"
            )
            return True, None
        except Exception as e:
            logger.exception(f"Failed to save text for resource {resource.id}")
            # Sanitize error message before returning to API
            safe_error = redact_data(f"Failed to save text: {str(e)}")
            return False, safe_error

    def _fallback_pdf_extraction(
        self, session, resource
    ) -> Tuple[bool, Optional[str]]:
        """Fallback: Download PDF to memory and extract text."""
        logger.info(
            f"API text extraction failed, falling back to in-memory PDF download for: {resource.url}"
        )

        downloader = self._get_downloader(resource.url)
        if not downloader:
            error_msg = "No compatible downloader found"
            logger.warning(
                f"✗ FAILED: {error_msg} for '{resource.title[:50]}...'"
            )
            self._record_failed_text_extraction(
                session, resource, error=error_msg
            )
            session.commit()
            return False, error_msg

        result = downloader.download_with_result(resource.url, ContentType.PDF)

        if not result.is_success or not result.content:
            error_msg = result.skip_reason or "Failed to download PDF"
            logger.warning(
                f"✗ FAILED: Could not download PDF for '{resource.title[:50]}...' | Error: {error_msg}"
            )
            self._record_failed_text_extraction(
                session, resource, error=f"PDF download failed: {error_msg}"
            )
            session.commit()
            return False, f"PDF extraction failed: {error_msg}"

        # Extract text from PDF
        text = self._extract_text_from_pdf(result.content)
        if not text:
            error_msg = "PDF text extraction returned empty text"
            logger.warning(
                f"Failed to extract text from PDF for: {resource.url}"
            )
            self._record_failed_text_extraction(
                session, resource, error=error_msg
            )
            session.commit()
            return False, error_msg

        try:
            self._save_text_with_db(
                resource,
                text,
                session,
                extraction_method="pdf_extraction",
                extraction_source="pdfplumber_fallback",
            )
            session.commit()
            logger.info(
                f"✓ SUCCESS: Extracted text from '{resource.title[:50]}...'"
            )
            return True, None
        except Exception as e:
            logger.exception(f"Failed to save text for resource {resource.id}")
            # Sanitize error message before returning to API
            safe_error = redact_data(f"Failed to save text: {str(e)}")
            return False, safe_error

    def _get_downloader(self, url: str):
        """
        Get the appropriate downloader for a URL.

        Args:
            url: The URL to download from

        Returns:
            The appropriate downloader instance or None
        """
        for downloader in self.downloaders:
            if downloader.can_handle(url):
                return downloader
        return None

    def _download_generic(self, url: str) -> Optional[bytes]:
        """Generic PDF download method."""
        try:
            headers = {
                "User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36"
            }
            response = requests.get(url, headers=headers, timeout=30)
            response.raise_for_status()

            # Verify it's a PDF
            content_type = response.headers.get("Content-Type", "")
            if (
                "pdf" not in content_type.lower()
                and not response.content.startswith(b"%PDF")
            ):
                logger.warning(f"Response is not a PDF: {content_type}")
                return None

            return response.content

        except Exception:
            logger.exception("Generic download failed")
            return None

    def _download_arxiv(self, url: str) -> Optional[bytes]:
        """Download from arXiv."""
        try:
            # Convert abstract URL to PDF URL
            pdf_url = url.replace("abs", "pdf")
            if not pdf_url.endswith(".pdf"):
                pdf_url += ".pdf"

            return self._download_generic(pdf_url)
        except Exception:
            logger.exception("arXiv download failed")
            return None

    def _try_europe_pmc(self, pmid: str) -> Optional[bytes]:
        """Try downloading from Europe PMC which often has better PDF availability."""
        try:
            # Europe PMC API is more reliable for PDFs
            # Check if PDF is available
            api_url = f"https://www.ebi.ac.uk/europepmc/webservices/rest/search?query=EXT_ID:{pmid}&format=json"
            response = requests.get(api_url, timeout=10)

            if response.status_code == 200:
                data = response.json()
                results = data.get("resultList", {}).get("result", [])

                if results:
                    article = results[0]
                    # Check if article has open access PDF
                    if (
                        article.get("isOpenAccess") == "Y"
                        and article.get("hasPDF") == "Y"
                    ):
                        pmcid = article.get("pmcid")
                        if pmcid:
                            # Europe PMC PDF URL
                            pdf_url = f"https://europepmc.org/backend/ptpmcrender.fcgi?accid={pmcid}&blobtype=pdf"
                            logger.info(
                                f"Found Europe PMC PDF for PMID {pmid}: {pmcid}"
                            )

                            headers = {
                                "User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36"
                            }

                            pdf_response = requests.get(
                                pdf_url, headers=headers, timeout=30
                            )
                            if pdf_response.status_code == 200:
                                content_type = pdf_response.headers.get(
                                    "content-type", ""
                                )
                                if (
                                    "pdf" in content_type.lower()
                                    or len(pdf_response.content) > 1000
                                ):
                                    return pdf_response.content
        except Exception as e:
            logger.debug(f"Europe PMC download failed: {e}")

        return None

    def _download_pubmed(self, url: str) -> Optional[bytes]:
        """Download from PubMed/PubMed Central with rate limiting."""
        try:
            # Apply rate limiting for PubMed requests
            current_time = time.time()
            time_since_last = current_time - self._last_pubmed_request
            if time_since_last < self._pubmed_delay:
                sleep_time = self._pubmed_delay - time_since_last
                logger.debug(
                    f"Rate limiting: sleeping {sleep_time:.2f}s before PubMed request"
                )
                time.sleep(sleep_time)
            self._last_pubmed_request = time.time()

            # If it's already a PMC article, download directly
            if "/articles/PMC" in url:
                pmc_match = re.search(r"(PMC\d+)", url)
                if pmc_match:
                    pmc_id = pmc_match.group(1)

                    # Try Europe PMC (more reliable)
                    europe_url = f"https://europepmc.org/backend/ptpmcrender.fcgi?accid={pmc_id}&blobtype=pdf"
                    headers = {
                        "User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36"
                    }

                    try:
                        response = requests.get(
                            europe_url, headers=headers, timeout=30
                        )
                        if response.status_code == 200:
                            content_type = response.headers.get(
                                "content-type", ""
                            )
                            if (
                                "pdf" in content_type.lower()
                                or len(response.content) > 1000
                            ):
                                logger.info(
                                    f"Downloaded PDF via Europe PMC for {pmc_id}"
                                )
                                return response.content
                    except Exception as e:
                        logger.debug(f"Direct Europe PMC download failed: {e}")
                        return None

            # If it's a regular PubMed URL, try to find PMC version
            elif urlparse(url).hostname == "pubmed.ncbi.nlm.nih.gov":
                # Extract PMID from URL
                pmid_match = re.search(r"/(\d+)/?", url)
                if pmid_match:
                    pmid = pmid_match.group(1)
                    logger.info(f"Attempting to download PDF for PMID: {pmid}")

                    # Try Europe PMC first (more reliable)
                    pdf_content = self._try_europe_pmc(pmid)
                    if pdf_content:
                        return pdf_content

                    # First try using NCBI E-utilities API to find PMC ID
                    try:
                        # Use elink to convert PMID to PMCID
                        elink_url = "https://eutils.ncbi.nlm.nih.gov/entrez/eutils/elink.fcgi"
                        params = {
                            "dbfrom": "pubmed",
                            "db": "pmc",
                            "id": pmid,
                            "retmode": "json",
                        }

                        api_response = requests.get(
                            elink_url, params=params, timeout=10
                        )
                        if api_response.status_code == 200:
                            data = api_response.json()
                            # Parse the response to find PMC ID
                            link_sets = data.get("linksets", [])
                            if link_sets and "linksetdbs" in link_sets[0]:
                                for linksetdb in link_sets[0]["linksetdbs"]:
                                    if linksetdb.get(
                                        "dbto"
                                    ) == "pmc" and linksetdb.get("links"):
                                        pmc_id_num = linksetdb["links"][0]
                                        # Now fetch PMC details to get the correct PMC ID format
                                        esummary_url = "https://eutils.ncbi.nlm.nih.gov/entrez/eutils/esummary.fcgi"
                                        summary_params = {
                                            "db": "pmc",
                                            "id": pmc_id_num,
                                            "retmode": "json",
                                        }
                                        summary_response = requests.get(
                                            esummary_url,
                                            params=summary_params,
                                            timeout=10,
                                        )
                                        if summary_response.status_code == 200:
                                            summary_data = (
                                                summary_response.json()
                                            )
                                            result = summary_data.get(
                                                "result", {}
                                            ).get(str(pmc_id_num), {})
                                            if result:
                                                # PMC IDs in the API don't have the "PMC" prefix
                                                pmc_id = f"PMC{pmc_id_num}"
                                                logger.info(
                                                    f"Found PMC ID via API: {pmc_id} for PMID: {pmid}"
                                                )

                                                # Try Europe PMC with the PMC ID
                                                europe_url = f"https://europepmc.org/backend/ptpmcrender.fcgi?accid={pmc_id}&blobtype=pdf"

                                                time.sleep(self._pubmed_delay)

                                                headers = {
                                                    "User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36"
                                                }

                                                try:
                                                    response = requests.get(
                                                        europe_url,
                                                        headers=headers,
                                                        timeout=30,
                                                    )
                                                    if (
                                                        response.status_code
                                                        == 200
                                                    ):
                                                        content_type = response.headers.get(
                                                            "content-type", ""
                                                        )
                                                        if (
                                                            "pdf"
                                                            in content_type.lower()
                                                            or len(
                                                                response.content
                                                            )
                                                            > 1000
                                                        ):
                                                            logger.info(
                                                                f"Downloaded PDF via Europe PMC for {pmc_id}"
                                                            )
                                                            return (
                                                                response.content
                                                            )
                                                except Exception as e:
                                                    logger.debug(
                                                        f"Europe PMC download with PMC ID failed: {e}"
                                                    )
                    except Exception as e:
                        logger.debug(
                            f"API lookup failed, trying webpage scraping: {e}"
                        )

                    # Fallback to webpage scraping if API fails
                    try:
                        headers = {
                            "User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36"
                        }
                        response = requests.get(
                            url, headers=headers, timeout=10
                        )
                        if response.status_code == 200:
                            # Look for PMC ID in the page
                            pmc_match = re.search(r"PMC\d+", response.text)
                            if pmc_match:
                                pmc_id = pmc_match.group(0)
                                logger.info(
                                    f"Found PMC ID via webpage: {pmc_id} for PMID: {pmid}"
                                )

                                # Add delay before downloading PDF
                                time.sleep(self._pubmed_delay)

                                # Try Europe PMC with the PMC ID (more reliable)
                                europe_url = f"https://europepmc.org/backend/ptpmcrender.fcgi?accid={pmc_id}&blobtype=pdf"
                                headers = {
                                    "User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36"
                                }

                                try:
                                    response = requests.get(
                                        europe_url, headers=headers, timeout=30
                                    )
                                    if response.status_code == 200:
                                        content_type = response.headers.get(
                                            "content-type", ""
                                        )
                                        if (
                                            "pdf" in content_type.lower()
                                            or len(response.content) > 1000
                                        ):
                                            logger.info(
                                                f"Downloaded PDF via Europe PMC for {pmc_id}"
                                            )
                                            return response.content
                                except Exception as e:
                                    logger.debug(
                                        f"Europe PMC download failed: {e}"
                                    )
                            else:
                                logger.info(
                                    f"No PMC version found for PMID: {pmid}"
                                )
                    except requests.exceptions.HTTPError as e:
                        if e.response.status_code == 429:
                            logger.warning(
                                "Rate limited by PubMed, increasing delay"
                            )
                            self._pubmed_delay = min(
                                self._pubmed_delay * 2, 5.0
                            )  # Max 5 seconds
                        raise
                    except Exception as e:
                        logger.debug(f"Could not check for PMC version: {e}")

            return self._download_generic(url)
        except Exception:
            logger.exception("PubMed download failed")
            return None

    def _download_semantic_scholar(self, url: str) -> Optional[bytes]:
        """Download from Semantic Scholar."""
        # Semantic Scholar doesn't host PDFs directly
        # Would need to extract actual PDF URL from page
        return None

    def _download_biorxiv(self, url: str) -> Optional[bytes]:
        """Download from bioRxiv."""
        try:
            # Convert to PDF URL
            pdf_url = url.replace(".org/", ".org/content/")
            pdf_url = re.sub(r"v\d+$", "", pdf_url)  # Remove version
            pdf_url += ".full.pdf"

            return self._download_generic(pdf_url)
        except Exception:
            logger.exception("bioRxiv download failed")
            return None

    def _download_medrxiv(self, url: str) -> Optional[bytes]:
        """Download from medRxiv."""
        # Same as bioRxiv
        return self._download_biorxiv(url)

    def _save_text_with_db(
        self,
        resource: ResearchResource,
        text: str,
        session: Session,
        extraction_method: str,
        extraction_source: str,
        pdf_document_id: Optional[int] = None,
    ) -> Optional[str]:
        """
        Save extracted text to encrypted database.

        Args:
            resource: The research resource
            text: Extracted text content
            session: Database session
            extraction_method: How the text was extracted
            extraction_source: Specific tool/API used
            pdf_document_id: ID of PDF document if extracted from PDF

        Returns:
            None (previously returned text file path, now removed)
        """
        try:
            # Calculate text metadata for database
            word_count = len(text.split())
            character_count = len(text)

            # Find the document by pdf_document_id or resource_id
            doc = None
            if pdf_document_id:
                doc = (
                    session.query(Document)
                    .filter_by(id=pdf_document_id)
                    .first()
                )
            else:
                doc = (
                    session.query(Document)
                    .filter_by(resource_id=resource.id)
                    .first()
                )

            if doc:
                # Update existing document with extracted text
                doc.text_content = text
                doc.character_count = character_count
                doc.word_count = word_count
                doc.extraction_method = extraction_method
                doc.extraction_source = extraction_source

                # Set quality based on method
                if extraction_method == "native_api":
                    doc.extraction_quality = "high"
                elif (
                    extraction_method == "pdf_extraction"
                    and extraction_source == "pdfplumber"
                ):
                    doc.extraction_quality = "medium"
                else:
                    doc.extraction_quality = "low"

                logger.debug(
                    f"Updated document {doc.id} with extracted text ({word_count} words)"
                )
            else:
                # Create a new Document for text-only extraction
                # Generate hash from text content
                text_hash = hashlib.sha256(text.encode()).hexdigest()

                # Get source type for research downloads
                try:
                    source_type_id = get_source_type_id(
                        self.username, "research_download"
                    )
                except Exception:
                    logger.exception(
                        "Failed to get source type for text document"
                    )
                    raise

                # Create new document
                doc_id = str(uuid.uuid4())
                doc = Document(
                    id=doc_id,
                    source_type_id=source_type_id,
                    resource_id=resource.id,
                    research_id=resource.research_id,
                    document_hash=text_hash,
                    original_url=resource.url,
                    file_path="text_only_not_stored",
                    file_size=character_count,  # Use character count as file size for text-only
                    file_type="text",
                    mime_type="text/plain",
                    title=resource.title,
                    text_content=text,
                    character_count=character_count,
                    word_count=word_count,
                    extraction_method=extraction_method,
                    extraction_source=extraction_source,
                    extraction_quality="high"
                    if extraction_method == "native_api"
                    else "medium",
                    status=DocumentStatus.COMPLETED,
                    processed_at=datetime.now(UTC),
                )
                session.add(doc)

                # Link to default Library collection
                library_collection = (
                    session.query(Collection).filter_by(name="Library").first()
                )
                if library_collection:
                    doc_collection = DocumentCollection(
                        document_id=doc_id,
                        collection_id=library_collection.id,
                        indexed=False,
                        chunk_count=0,
                    )
                    session.add(doc_collection)
                else:
                    logger.warning(
                        f"Library collection not found - document {doc_id} will not be linked to default collection"
                    )

                logger.info(
                    f"Created new document {doc_id} for text-only extraction ({word_count} words)"
                )

            logger.info(
                f"Saved text to encrypted database ({word_count} words)"
            )
            return None

        except Exception:
            logger.exception("Error saving text to encrypted database")
            raise  # Re-raise so caller can handle the error

    def _create_text_document_record(
        self,
        session: Session,
        resource: ResearchResource,
        file_path: Path,
        extraction_method: str,
        extraction_source: str,
    ):
        """Update existing Document with text from file (for legacy text files)."""
        try:
            # Read file to get metadata
            text = file_path.read_text(encoding="utf-8", errors="ignore")
            word_count = len(text.split())
            character_count = len(text)

            # Find the Document by resource_id
            doc = (
                session.query(Document)
                .filter_by(resource_id=resource.id)
                .first()
            )

            if doc:
                # Update existing document with text content
                doc.text_content = text
                doc.character_count = character_count
                doc.word_count = word_count
                doc.extraction_method = extraction_method
                doc.extraction_source = extraction_source
                doc.extraction_quality = (
                    "low"  # Unknown quality for legacy files
                )
                logger.info(
                    f"Updated document {doc.id} with text from file: {file_path.name}"
                )
            else:
                logger.warning(
                    f"No document found to update for resource {resource.id}"
                )

        except Exception:
            logger.exception("Error updating document with text from file")

    def _record_failed_text_extraction(
        self, session: Session, resource: ResearchResource, error: str
    ):
        """Record a failed text extraction attempt in the Document."""
        try:
            # Find the Document by resource_id
            doc = (
                session.query(Document)
                .filter_by(resource_id=resource.id)
                .first()
            )

            if doc:
                # Update document with extraction error
                doc.error_message = error
                doc.extraction_method = "failed"
                doc.extraction_quality = "low"
                logger.info(
                    f"Recorded failed text extraction for document {doc.id}: {error}"
                )
            else:
                logger.warning(
                    f"No document found to record extraction failure for resource {resource.id}"
                )

        except Exception:
            logger.exception("Error recording failed text extraction")
